import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Shared request headers: a desktop-browser User-Agent so the government
# site serves the normal pages instead of rejecting the scraper as a bot.
# (The earlier inline scraping draft that lived here as commented-out code
# was removed — it duplicated the functions defined below.)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'
}

def main():
    """Scrape the 2023 Beijing tourism statistics listing and dump all rows to CSV."""
    listing_url = 'https://whlyj.beijing.gov.cn/zwgk/zxgs/tjxx/history/2023/'
    listing_html = req(listing_url)
    detail_urls = parse_url_data(listing_html)
    result = parse_data(detail_urls)
    save_data_to_csv(result['info'], result['columns'])


def req(url, timeout=30):
    """GET *url* with the shared browser headers and return the body as text.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds before the request is aborted. requests has no
            default timeout, so without this an unresponsive server would
            hang the scraper forever.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail fast instead of handing an HTML error page to the parsers.
    response.raise_for_status()
    return response.text


def parse_url_data(html):
    """Extract the per-page detail URLs from the 2023 listing page.

    Args:
        html: Raw HTML of the listing page.

    Returns:
        List of absolute URLs to the individual statistics pages.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Detail links look like "./lyqhd/<dir>/<name>_<id>.html". The dot
    # before "html" is escaped (the original pattern's bare "." matched
    # any character, e.g. "Xhtml" or "1html").
    anchors = soup.find_all('a', attrs={'href': re.compile(r'\./lyqhd/\w+/\w+_\w+\.html')})
    prefix = "https://whlyj.beijing.gov.cn/zwgk/zxgs/tjxx/history/2023"
    # Each href starts with "."; drop it and splice onto the absolute prefix.
    return [prefix + a['href'][1:] for a in anchors]


def info_replace(info):
    """Strip every whitespace character from a scraped table-cell string.

    Replaces the original chain of .replace() calls (which included a
    duplicated space replacement, likely meant to be an NBSP or full-width
    space) with a single Unicode-aware pass: ``\\s`` matches newlines, tabs,
    carriage returns, ASCII spaces, NBSP (\\xa0), full-width spaces
    (\\u3000), and all other Unicode whitespace.

    Args:
        info: Raw cell text pulled from the HTML table.

    Returns:
        The text with all whitespace removed.
    """
    return re.sub(r'\s+', '', info)


def parse_data(url_list):
    """Scrape every detail page and collect the 2023/2022 visitor rows.

    Args:
        url_list: Absolute URLs of the per-period statistics pages.

    Returns:
        Dict with 'info' (list of rows; each row's first cell is a date
        label like '2023年X月') and 'columns' (captions taken from the
        last page visited — pages are assumed to share one layout).
    """
    rows = []
    captions = []
    for page_url in url_list:
        page = BeautifulSoup(req(page_url), 'lxml')
        # Rows 1..9 of the table body: row 1 carries the period, 2..9 the data.
        body_rows = page.find('tbody').find_all('tr')[1:10]
        period = info_replace(body_rows[0].find_all('td')[2].text)
        cells_per_row = [tr.find_all('td') for tr in body_rows[1:]]
        # Caption = "name(unit)" built from the first two cells of each row.
        captions = [
            f'{info_replace(cells[0].text)}({info_replace(cells[1].text)})'
            for cells in cells_per_row
        ]
        # Column 2 holds the 2023 figure, column 3 the 2022 comparison.
        row_2023 = [f'2023年{period}']
        row_2022 = [f'2022年{period}']
        for cells in cells_per_row:
            row_2023.append(info_replace(cells[2].text))
            row_2022.append(info_replace(cells[3].text))
        rows.append(row_2023)
        rows.append(row_2022)
        # Be polite to the server between page fetches.
        time.sleep(1)
    return {
        'info': rows,
        'columns': captions
    }


def save_data_to_csv(data, columns, path='../static/data/info.csv'):
    """Write the scraped rows to a CSV file.

    Args:
        data: Rows; each row's first cell is the date label.
        columns: Column captions ('日期' is prepended here for the date cell).
        path: Output file path. Defaults to the original hard-coded
            location, so existing callers are unaffected.
    """
    # Create the target directory if it is missing so to_csv cannot fail
    # with FileNotFoundError on a fresh checkout.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(data, columns=['日期'] + columns)
    # utf_8_sig writes a BOM so Excel renders the Chinese headers correctly.
    df.to_csv(path, index=False, encoding='utf_8_sig')


# Script entry point: scrape the listing page and write the CSV.
if __name__ == '__main__':
    main()
